Data source: https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-Present/ijzp-q8t2
# import libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Read the Chicago crime extract from the local CSV export.
csv_path = r"C:\Users\jki\Downloads\Chicago_Crime_Data-v2.csv"
chicago_crime_df = pd.read_csv(csv_path)
# Show the first five rows (the default for head) to sanity-check the load.
chicago_crime_df.head()
ID | CASE_NUMBER | DATE | BLOCK | IUCR | PRIMARY_TYPE | DESCRIPTION | LOCATION_DESCRIPTION | ARREST | DOMESTIC | ... | WARD | COMMUNITY_AREA_NUMBER | FBICODE | X_COORDINATE | Y_COORDINATE | YEAR | UPDATEDON | LATITUDE | LONGITUDE | LOCATION | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3512276 | HK587712 | 08/28/2004 05:50:56 PM | 047XX S KEDZIE AVE | 890 | THEFT | FROM BUILDING | SMALL RETAIL STORE | False | False | ... | 14.0 | 58.0 | 6 | 1155838.0 | 1873050.0 | 2004 | 02/10/2018 03:50:01 PM | 41.807440 | -87.703956 | (41.8074405, -87.703955849) |
1 | 3406613 | HK456306 | 06/26/2004 12:40:00 PM | 009XX N CENTRAL PARK AVE | 820 | THEFT | $500 AND UNDER | OTHER | False | False | ... | 27.0 | 23.0 | 6 | 1152206.0 | 1906127.0 | 2004 | 02/28/2018 03:56:25 PM | 41.898280 | -87.716406 | (41.898279962, -87.716405505) |
2 | 8002131 | HT233595 | 04/04/2011 05:45:00 AM | 043XX S WABASH AVE | 820 | THEFT | $500 AND UNDER | NURSING HOME/RETIREMENT HOME | False | False | ... | 3.0 | 38.0 | 6 | 1177436.0 | 1876313.0 | 2011 | 02/10/2018 03:50:01 PM | 41.815933 | -87.624642 | (41.815933131, -87.624642127) |
3 | 7903289 | HT133522 | 12/30/2010 04:30:00 PM | 083XX S KINGSTON AVE | 840 | THEFT | FINANCIAL ID THEFT: OVER $300 | RESIDENCE | False | False | ... | 7.0 | 46.0 | 6 | 1194622.0 | 1850125.0 | 2010 | 02/10/2018 03:50:01 PM | 41.743665 | -87.562463 | (41.743665322, -87.562462756) |
4 | 10402076 | HZ138551 | 02/02/2016 07:30:00 PM | 033XX W 66TH ST | 820 | THEFT | $500 AND UNDER | ALLEY | False | False | ... | 15.0 | 66.0 | 6 | 1155240.0 | 1860661.0 | 2016 | 02/10/2018 03:50:01 PM | 41.773455 | -87.706480 | (41.773455295, -87.706480471) |
5 rows × 22 columns
# Count the missing entries in every column.
missing_values = chicago_crime_df.isnull().sum()
print(missing_values)
ID 0 CASE_NUMBER 0 DATE 0 BLOCK 0 IUCR 0 PRIMARY_TYPE 0 DESCRIPTION 0 LOCATION_DESCRIPTION 0 ARREST 0 DOMESTIC 0 BEAT 0 DISTRICT 0 WARD 43 COMMUNITY_AREA_NUMBER 43 FBICODE 0 X_COORDINATE 4 Y_COORDINATE 4 YEAR 0 UPDATEDON 0 LATITUDE 4 LONGITUDE 4 LOCATION 4 dtype: int64
# Discard, in place, every row that lacks a value in any of these columns.
columns_with_gaps = [
    'WARD',
    'COMMUNITY_AREA_NUMBER',
    'X_COORDINATE',
    'Y_COORDINATE',
    'LATITUDE',
    'LONGITUDE',
    'LOCATION',
]
chicago_crime_df.dropna(subset=columns_with_gaps, inplace=True)
# Re-run the per-column count to verify that no gaps remain.
missing_values = chicago_crime_df.isnull().sum()
print(missing_values)
ID 0 CASE_NUMBER 0 DATE 0 BLOCK 0 IUCR 0 PRIMARY_TYPE 0 DESCRIPTION 0 LOCATION_DESCRIPTION 0 ARREST 0 DOMESTIC 0 BEAT 0 DISTRICT 0 WARD 0 COMMUNITY_AREA_NUMBER 0 FBICODE 0 X_COORDINATE 0 Y_COORDINATE 0 YEAR 0 UPDATEDON 0 LATITUDE 0 LONGITUDE 0 LOCATION 0 dtype: int64
# Do we have unnecessary negative values?
# describe() reports count/mean/std/min/quartiles/max per numeric column;
# the "min" row is the one to scan for unexpected negatives.
chicago_crime_df.describe()
ID | BEAT | DISTRICT | WARD | COMMUNITY_AREA_NUMBER | X_COORDINATE | Y_COORDINATE | YEAR | LATITUDE | LONGITUDE | |
---|---|---|---|---|---|---|---|---|---|---|
count | 4.870000e+02 | 487.000000 | 487.00000 | 487.000000 | 487.000000 | 4.870000e+02 | 4.870000e+02 | 487.000000 | 487.000000 | 487.000000 |
mean | 6.622419e+06 | 1194.279261 | 11.36345 | 22.624230 | 37.595483 | 1.162707e+06 | 1.886352e+06 | 2008.969199 | 41.843787 | -87.678434 |
std | 2.828354e+06 | 665.280757 | 6.56601 | 13.088511 | 21.457648 | 1.637996e+04 | 3.036550e+04 | 4.668989 | 0.083528 | 0.059614 |
min | 2.114900e+04 | 111.000000 | 1.00000 | 1.000000 | 1.000000 | 1.100658e+06 | 1.814512e+06 | 2001.000000 | 41.645796 | -87.905227 |
25% | 3.978664e+06 | 711.000000 | 6.00000 | 12.000000 | 23.000000 | 1.151282e+06 | 1.860430e+06 | 2005.000000 | 41.772486 | -87.720007 |
50% | 6.780581e+06 | 1111.000000 | 11.00000 | 24.000000 | 30.000000 | 1.162315e+06 | 1.891618e+06 | 2009.000000 | 41.858444 | -87.679976 |
75% | 9.123182e+06 | 1652.500000 | 16.00000 | 32.000000 | 58.000000 | 1.174478e+06 | 1.908020e+06 | 2013.000000 | 41.903339 | -87.635440 |
max | 1.127717e+07 | 2535.000000 | 25.00000 | 50.000000 | 77.000000 | 1.204126e+06 | 1.951001e+06 | 2018.000000 | 42.021178 | -87.528223 |
# Let's have a look at the data types: info() lists each column's dtype
# together with its non-null count.
chicago_crime_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 487 entries, 0 to 531 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 487 non-null int64 1 CASE_NUMBER 487 non-null object 2 DATE 487 non-null object 3 BLOCK 487 non-null object 4 IUCR 487 non-null object 5 PRIMARY_TYPE 487 non-null object 6 DESCRIPTION 487 non-null object 7 LOCATION_DESCRIPTION 487 non-null object 8 ARREST 487 non-null bool 9 DOMESTIC 487 non-null bool 10 BEAT 487 non-null int64 11 DISTRICT 487 non-null int64 12 WARD 487 non-null float64 13 COMMUNITY_AREA_NUMBER 487 non-null float64 14 FBICODE 487 non-null object 15 X_COORDINATE 487 non-null float64 16 Y_COORDINATE 487 non-null float64 17 YEAR 487 non-null int64 18 UPDATEDON 487 non-null object 19 LATITUDE 487 non-null float64 20 LONGITUDE 487 non-null float64 21 LOCATION 487 non-null object dtypes: bool(2), float64(6), int64(4), object(10) memory usage: 80.8+ KB
# Convert the YEAR column to a datetime dtype.
# YEAR holds plain integers such as 2004. Without an explicit format,
# pd.to_datetime reads integers as nanosecond offsets from the Unix epoch,
# which produces 1970-01-01 00:00:00.000002004 rather than a date in 2004.
# Parsing with '%Y' maps each value to January 1st of that year instead.
chicago_crime_df['YEAR'] = pd.to_datetime(chicago_crime_df['YEAR'], format='%Y')
chicago_crime_df['YEAR'].info()
<class 'pandas.core.series.Series'> Int64Index: 487 entries, 0 to 531 Series name: YEAR Non-Null Count Dtype -------------- ----- 487 non-null datetime64[ns] dtypes: datetime64[ns](1) memory usage: 7.6 KB
# Print the first five rows to inspect the YEAR column after the conversion.
print(chicago_crime_df.head(5))
ID CASE_NUMBER DATE BLOCK \ 0 3512276 HK587712 08/28/2004 05:50:56 PM 047XX S KEDZIE AVE 1 3406613 HK456306 06/26/2004 12:40:00 PM 009XX N CENTRAL PARK AVE 2 8002131 HT233595 04/04/2011 05:45:00 AM 043XX S WABASH AVE 3 7903289 HT133522 12/30/2010 04:30:00 PM 083XX S KINGSTON AVE 4 10402076 HZ138551 02/02/2016 07:30:00 PM 033XX W 66TH ST IUCR PRIMARY_TYPE DESCRIPTION \ 0 890 THEFT FROM BUILDING 1 820 THEFT $500 AND UNDER 2 820 THEFT $500 AND UNDER 3 840 THEFT FINANCIAL ID THEFT: OVER $300 4 820 THEFT $500 AND UNDER LOCATION_DESCRIPTION ARREST DOMESTIC ... WARD \ 0 SMALL RETAIL STORE False False ... 14.0 1 OTHER False False ... 27.0 2 NURSING HOME/RETIREMENT HOME False False ... 3.0 3 RESIDENCE False False ... 7.0 4 ALLEY False False ... 15.0 COMMUNITY_AREA_NUMBER FBICODE X_COORDINATE Y_COORDINATE \ 0 58.0 6 1155838.0 1873050.0 1 23.0 6 1152206.0 1906127.0 2 38.0 6 1177436.0 1876313.0 3 46.0 6 1194622.0 1850125.0 4 66.0 6 1155240.0 1860661.0 YEAR UPDATEDON LATITUDE LONGITUDE \ 0 1970-01-01 00:00:00.000002004 02/10/2018 03:50:01 PM 41.807440 -87.703956 1 1970-01-01 00:00:00.000002004 02/28/2018 03:56:25 PM 41.898280 -87.716406 2 1970-01-01 00:00:00.000002011 02/10/2018 03:50:01 PM 41.815933 -87.624642 3 1970-01-01 00:00:00.000002010 02/10/2018 03:50:01 PM 41.743665 -87.562463 4 1970-01-01 00:00:00.000002016 02/10/2018 03:50:01 PM 41.773455 -87.706480 LOCATION 0 (41.8074405, -87.703955849) 1 (41.898279962, -87.716405505) 2 (41.815933131, -87.624642127) 3 (41.743665322, -87.562462756) 4 (41.773455295, -87.706480471) [5 rows x 22 columns]
# Find the earliest incident date.
# DATE is still stored as text in "MM/DD/YYYY hh:mm:ss AM/PM" form here, so a
# plain .min() would compare the strings character by character (month first)
# and return the alphabetically smallest value, not the oldest one. Parse the
# column into timestamps first and take the minimum of those.
incident_dates = pd.to_datetime(chicago_crime_df['DATE'], format='%m/%d/%Y %I:%M:%S %p')
oldest_date = incident_dates.min()
print("Oldest data point in the dataset:", oldest_date)
Oldest data point in the dataset: 01/01/2010 01:22:08 AM
import pandas as pd

# Parse the DATE strings once and reuse the result for both columns.
parsed_dates = pd.to_datetime(chicago_crime_df['DATE'], format='%m/%d/%Y %I:%M:%S %p')
chicago_crime_df['DATE'] = parsed_dates
# YEAR is rebuilt from the parsed incident date.
chicago_crime_df['YEAR'] = parsed_dates.dt.year

# Number of recorded crimes per year, indexed by year.
crimes_by_year = chicago_crime_df.groupby('YEAR').size()

# The busiest year and how many crimes it had.
max_crimes_year = crimes_by_year.idxmax()
max_crimes_count = crimes_by_year.loc[max_crimes_year]

print("Year with the largest amount of crimes:", max_crimes_year)
print("Number of crimes committed that year:", max_crimes_count)
Year with the largest amount of crimes: 2005 Number of crimes committed that year: 44
# Five most common crime types in 2020 and their arrest rates.
# Expects 'chicago_crime_df' with an integer YEAR column, a PRIMARY_TYPE
# column and a boolean ARREST column.
crimes_2020 = chicago_crime_df[chicago_crime_df['YEAR'] == 2020]

if crimes_2020.empty:
    # Nothing to rank when the dataset holds no rows for that year.
    print("No crimes recorded for the year 2020.")
else:
    # Occurrences of each crime type, most frequent first.
    crime_counts_2020 = crimes_2020['PRIMARY_TYPE'].value_counts()

    # Keep the five most common types.
    top_5_crimes_2020 = crime_counts_2020.head(5)
    print("Five most common crimes in 2020:")
    print(top_5_crimes_2020)

    # Arrest rate per crime type: the mean of the boolean ARREST column equals
    # arrests / total crimes. Re-index by the top-5 order so that ties are
    # resolved in favour of the more frequent crime type.
    arrest_rates_2020 = (
        crimes_2020.groupby('PRIMARY_TYPE')['ARREST']
        .mean()
        .loc[top_5_crimes_2020.index]
    )

    # Crime types with the highest and lowest arrest rate among the top 5.
    # The frame is non-empty in this branch, so there is at least one type.
    highest_arrest_rate_crime = arrest_rates_2020.idxmax()
    lowest_arrest_rate_crime = arrest_rates_2020.idxmin()

    print("\nCrime with the highest arrest rate among the top 5:", highest_arrest_rate_crime)
    print("Arrest rate:", arrest_rates_2020[highest_arrest_rate_crime])
    print("\nCrime with the lowest arrest rate among the top 5:", lowest_arrest_rate_crime)
    print("Arrest rate:", arrest_rates_2020[lowest_arrest_rate_crime])
No crimes recorded for the year 2020.
import matplotlib.pyplot as plt

# Group once by year and derive both yearly totals from the same grouping.
rows_by_year = chicago_crime_df.groupby('YEAR')
crimes_by_year = rows_by_year.size()
arrests_by_year = rows_by_year['ARREST'].sum()

# Share of crimes that led to an arrest, per year.
arrest_rates_by_year = arrests_by_year.div(crimes_by_year)

# Year with the highest arrest rate and the rate itself.
year_highest_arrest_rate = arrest_rates_by_year.idxmax()
highest_arrest_rate = arrest_rates_by_year.loc[year_highest_arrest_rate]

print("Year with the highest arrest rate:", year_highest_arrest_rate)
print("Highest arrest rate:", highest_arrest_rate)

# Line chart of the yearly crime counts.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(crimes_by_year.index, crimes_by_year.values, marker='o', linestyle='-')
ax.set_title('Number of Crimes per Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Crimes')
ax.grid(True)
plt.show()
Year with the highest arrest rate: 2001 Highest arrest rate: 1.0
import matplotlib.pyplot as plt

# Arrests per year: summing the boolean ARREST column counts the True values.
arrests_by_year = chicago_crime_df.groupby('YEAR')['ARREST'].sum()

# Year with the most arrests and the corresponding count.
year_most_arrests = arrests_by_year.idxmax()
most_arrests = arrests_by_year.loc[year_most_arrests]

print("Year with the most arrests:", year_most_arrests)
print("Number of arrests made during that year:", most_arrests)

# Line chart of the yearly arrest totals.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(arrests_by_year.index, arrests_by_year.values, marker='o', linestyle='-')
ax.set_title('Total Number of Arrests per Year')
ax.set_xlabel('Year')
ax.set_ylabel('Total Number of Arrests')
ax.grid(True)
plt.show()
Year with the most arrests: 2005 Number of arrests made during that year: 18
import matplotlib.pyplot as plt

# Yearly totals: number of recorded crimes and number of arrests.
crimes_by_year = chicago_crime_df.groupby('YEAR').size()
arrests_by_year = chicago_crime_df.groupby('YEAR')['ARREST'].sum()

# Arrest rate per year = arrests / crimes.
arrest_rates_by_year = arrests_by_year / crimes_by_year

# Plot the trend for arrest rate over time.
plt.figure(figsize=(10, 6))
plt.plot(arrest_rates_by_year.index, arrest_rates_by_year.values, marker='o', linestyle='-')
plt.title('Arrest Rate Over Time')
plt.xlabel('Year')
plt.ylabel('Arrest Rate')
plt.grid(True)
plt.show()

# Find the pair of adjacent years with the biggest change in arrest rate.
# diff() gives the signed change from the previous row; the first row has no
# predecessor, so it is skipped. Adjacent rows are compared as-is: a year with
# no records leaves a gap that is not accounted for here.
rate_changes = arrest_rates_by_year.diff().iloc[1:]

max_change = 0
signed_max_change = 0.0
year_start_max_change = None
year_end_max_change = None
if not rate_changes.empty and rate_changes.abs().max() > 0:
    # idxmax returns the first occurrence, so the earliest pair wins on ties.
    year_end_max_change = rate_changes.abs().idxmax()
    end_position = arrest_rates_by_year.index.get_loc(year_end_max_change)
    year_start_max_change = arrest_rates_by_year.index[end_position - 1]
    signed_max_change = rate_changes.loc[year_end_max_change]
    max_change = abs(signed_max_change)

print("Between which years can you see the biggest change in Arrest Rate?")
if year_start_max_change is None:
    # Fewer than two years of data, or the rate never changed.
    print("Not enough yearly data to identify a change in arrest rate.")
else:
    print("Between {} and {}".format(year_start_max_change, year_end_max_change))

    # Analysis and comments on the conclusions.
    # Specific reasons for changes in arrest rate may vary and require further
    # investigation, such as changes in policing strategies, crime reporting
    # methods, community relations, etc.
    # Report the actual direction of the change instead of always calling it a drop.
    direction = "drop" if signed_max_change < 0 else "rise"
    print("\nPossible reasons for the {} in arrest rate between {} and {}:".format(
        direction, year_start_max_change, year_end_max_change))
    print("- Changes in law enforcement policies or priorities")
    print("- Changes in community relations or trust in law enforcement")
    print("- Changes in crime reporting methods or data collection processes")
    print("- Socioeconomic factors impacting crime rates and law enforcement effectiveness")
    # Add more specific reasons as per the context of the dataset and external
    # factors influencing crime and law enforcement.
Between which years can you see the biggest change in Arrest Rate? Between 2001 and 2002 Possible reasons for the drop in arrest rate between 2001 and 2002: - Changes in law enforcement policies or priorities - Changes in community relations or trust in law enforcement - Changes in crime reporting methods or data collection processes - Socioeconomic factors impacting crime rates and law enforcement effectiveness